library(tidyverse)
library(httr)
library(jsonlite)
source("times.R")
source("io.R")
source("manipulate_data.R")
### get real-time data through API
url <- "https://maps2.dcgis.dc.gov/dcgis/rest/services/DCGIS_DATA/Transportation_WebMercator/MapServer/5/query?where=1%3D1&outFields=*&outSR=4326&f=json"
# 1) use the URL to make a request from the API
response <- GET(url = url,
user_agent("Georgetown University Assignment"))
# 2) Check for a server error in the response. stop_for_status() raises an
#    R error on any 4xx/5xx code (http_status() alone only reports it), so a
#    failed request cannot slip silently into the parsing steps below.
stop_for_status(response)
http_status(response)
## $category
## [1] "Success"
##
## $reason
## [1] "OK"
##
## $message
## [1] "Success: (200) OK"
# 3) get the contents of the response as a text string
data_json <- content(response, as = "text")
# 4) parse the JSON text into R structures
data_matrix <- fromJSON(data_json)
# 5) turn the body into a tibble, lower-casing the variable names
#    (resolves the earlier "how to use lower case for var names?" question)
realtime_data <- data_matrix$features$attributes %>%
as_tibble() %>%
rename_with(tolower)
### get historical data through CSVs
cbs_data <- read_files('data/')
# Split departures from arrivals, then keep only stations within 1600 m of
# the Lincoln Memorial (helpers come from manipulate_data.R)
data <- cbs_data %>%
sep_departures_from_arrivals() %>%
filter_by_distance(from_station = 'lincoln_memorial', distance_m = 1600)
# Aggregate to station-by-hour counts and attach historical weather data
hour_data <- data %>%
get_station_hourly() %>%
get_historic_weather()
# Amount of rides at the Lincoln Memorial over all time, and by various metrics
# The grouped data lacks the original date column, which is handy for
# visualizations; add_date_column() in times.R restores it.
hour_data_date <- add_date_column(hour_data)
# rides over the full time range
# NOTE(review): there is a weird gap after July 2020 — cause unknown, verify
hour_data_date %>%
ggplot(aes(x = date, y = lincoln_memorial, color = type)) +
geom_point()
# by weekday: clearly people cycle more on Saturdays and Sundays
hour_data_date %>%
mutate(weekday = wday(date, label = TRUE)) %>%
ggplot(aes(x = weekday, y = lincoln_memorial, color = type)) +
geom_point()
# by day of the month: no clear pattern across days of the month,
# so probably not a good predictor
hour_data_date %>%
ggplot(aes(x = day, y = lincoln_memorial, color = type)) +
geom_point()
# Farah's alternative weekday plot — kept commented out because it largely
# duplicates the weekday scatter above (it adds alpha, a title, and theme_minimal)
#hour_data_date %>%
#mutate(weekday = wday(date, label=TRUE)) %>%
#ggplot(aes(x = weekday, y = lincoln_memorial)) +
#geom_point(alpha = 0.25) +
#labs(title = "Capital bikeshare Ridership on Week days") +
#theme_minimal()
# by hour: ridership peaks in the middle of the day, roughly 1 to 7 PM
hour_data_date %>%
ggplot(aes(x = hour, y = lincoln_memorial, color = type)) +
geom_point()
# by month: people cycle most in spring, summer and fall; surprisingly
# there is no big spike in the summer months
hour_data_date %>%
ggplot(aes(x = month, y = lincoln_memorial, color = type)) +
geom_point()
# by year: overall, more people use the service in 2021 than in 2020
# NOTE(review): something seems off with the year variable — verify
hour_data_date %>%
ggplot(aes(x = year, y = lincoln_memorial, color = type)) +
geom_point()
# for a specific month and day, seeing variance by hour
hour_data_date %>%
filter(year == 2021, month == "Aug", day == 15) %>%
ggplot(aes(x = hour, y = lincoln_memorial, color = type)) +
geom_line()
# The below data can be used for a heat map.
# One row of coordinates per station; distinct() keeps the first occurrence,
# exactly like the old filter(duplicated(station) == FALSE) but idiomatic.
coord_data <- data %>%
select(station, lat, lng) %>%
distinct(station, .keep_all = TRUE)
heat_map_data <- data %>%
# filter() %>% Filter here if you want a certain date range
group_by(station, type) %>%
summarize(count = n(), .groups = "drop") %>% # drop grouping explicitly: no warning
pivot_wider(names_from = type, values_from = count) %>%
left_join(coord_data, by = "station") %>% # explicit key: no "Joining, by" message
filter(!is.na(lat), !is.na(lng))
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
# Download US state boundaries (cartographic-boundary shapefiles via tigris)
# and crop them to a bounding box around the Washington DC metro area
# (lon -77.4..-76.8, lat 38.75..39.15) to use as a base map.
states <- tigris::states(cb = TRUE, progress_bar = FALSE) %>%
st_crop(xmin = -77.4, xmax = -76.8,
ymin = 38.75, ymax = 39.15)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
# Convert the station-level table to an sf point layer. remove = FALSE keeps
# the lng/lat columns alongside the geometry; CRS 4326 is WGS84 lon/lat.
dep_sf <- heat_map_data %>%
st_as_sf(coords=c("lng", "lat"), remove = FALSE) %>%
st_set_crs(value = 4326)
# Departure counts per station over the DC base map
ggplot(dep_sf) +
geom_sf(data = states, fill = NA) +
geom_sf(aes(color = departure), alpha = 0.3)
# Arrival counts per station
ggplot(dep_sf) +
geom_sf(data = states, fill = NA) +
geom_sf(aes(color = arrival), alpha = 0.3)
# Net flow per station: departures minus arrivals
ggplot(dep_sf) +
geom_sf(data = states, fill = NA) +
geom_sf(aes(color = departure - arrival), alpha = 0.3)
# NOTE: removed a leftover git merge-conflict marker ("<<<<<<< Updated upstream")
# that made the script unparsable; resolve the merge properly before committing.
# Ridership vs. maximum temperature (first version, yellow points)
# NOTE(review): nearly identical to the orange plot below — likely the other
# side of the unresolved merge; consider keeping only one.
ggplot(hour_data_date, aes(x = maximum_temperature, y = lincoln_memorial)) +
geom_point(alpha = 0.07, color = 'yellow') +
labs(x = 'Temperature', y = 'Hourly Departures and Arrivals') +
geom_smooth(method = 'lm', color = 'red')
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1098 rows containing non-finite values (stat_smooth).
## Warning: Removed 1098 rows containing missing values (geom_point).
# Same relationship: as maximum temperature goes up, departures and arrivals go up
ggplot(hour_data_date, aes(x = maximum_temperature, y = lincoln_memorial)) +
geom_point(alpha = 0.07, color = 'orange') +
labs(x = 'Temperature', y = 'Hourly Departures and Arrivals') +
geom_smooth(method = 'lm', color = 'red')
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1098 rows containing non-finite values (stat_smooth).
## Warning: Removed 1098 rows containing missing values (geom_point).
# Ridership vs. wind speed (code adapted from an online example)
# NOTE(review): weather data has missing values — 1098 rows are dropped.
ggplot(hour_data_date, aes(x = wind_speed, y = lincoln_memorial)) +
geom_point(alpha = 0.07, color = 'blue') +
labs(x = 'Wind Speed', y = 'Hourly Departures and Arrivals')
## Warning: Removed 1098 rows containing missing values (geom_point).
#geom_smooth(method='lm',color= 'red') # adding this shows an upward trend,
# which is odd given the expected negative relationship — investigate
# Ridership vs. precipitation (code adapted from an online example).
# Negative relationship between bike usage and precipitation, as expected.
# NOTE(review): weather data has missing values — 1098 rows are dropped.
hour_data_date %>%
ggplot(aes(x = precipitation, y = lincoln_memorial)) +
geom_point(alpha = 0.07, color = 'blue') +
labs(x = 'Precipitation', y = 'Hourly Departures and Arrivals') + # fixed typo 'Precipiation'
geom_smooth(method = 'lm', color = 'red')
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1098 rows containing non-finite values (stat_smooth).
## Warning: Removed 1098 rows containing missing values (geom_point).
# NOTE(review): precipitation shows little variation — consider a log or
# binary (rain / no rain) transformation before modeling.